tidycensus

Author

Divij Sinha

Tidycensus

Setting the key

library(tidyverse)
library(tidycensus)
library(tigris)
library(sf)

# .gitignore !

# api_key <- read_file("week 3/api_key.txt") %>% trimws()
# census_api_key(key=api_key, install = TRUE)

Getting Decennial

dhc_20 <- tidycensus::load_variables(2020, "dhc", cache = TRUE)
dhc_20
# A tibble: 9,067 × 3
   name     label                                                        concept
   <chr>    <chr>                                                        <chr>  
 1 H10_001N " !!Total:"                                                  TENURE…
 2 H10_002N " !!Total:!!Owner occupied:"                                 TENURE…
 3 H10_003N " !!Total:!!Owner occupied:!!Householder who is White alone" TENURE…
 4 H10_004N " !!Total:!!Owner occupied:!!Householder who is Black or Af… TENURE…
 5 H10_005N " !!Total:!!Owner occupied:!!Householder who is American In… TENURE…
 6 H10_006N " !!Total:!!Owner occupied:!!Householder who is Asian alone" TENURE…
 7 H10_007N " !!Total:!!Owner occupied:!!Householder who is Native Hawa… TENURE…
 8 H10_008N " !!Total:!!Owner occupied:!!Householder who is Some Other … TENURE…
 9 H10_009N " !!Total:!!Owner occupied:!!Householder who is Two or More… TENURE…
10 H10_010N " !!Total:!!Renter occupied:"                                TENURE…
# ℹ 9,057 more rows
dhc_20 %>% 
  mutate(concept_trim = str_remove(concept, "\\(.*\\)")) %>%
  group_by(concept_trim) %>% summarise(n = n()) %>% arrange(desc(n))
# A tibble: 76 × 2
   concept_trim                                                                n
   <chr>                                                                   <int>
 1 "SEX BY SINGLE-YEAR AGE "                                                3135
 2 "SEX BY AGE FOR SELECTED AGE CATEGORIES "                                1666
 3 "GROUP QUARTERS POPULATION BY SEX BY AGE BY MAJOR GROUP QUARTERS TYPE "   567
 4 "HOUSEHOLD TYPE "                                                         477
 5 "SEX BY AGE FOR THE POPULATION IN HOUSEHOLDS "                            441
 6 "SEX BY SINGLE-YEAR AGE"                                                  209
 7 "GROUP QUARTERS POPULATION BY SEX BY AGE BY GROUP QUARTERS TYPE"          195
 8 "TENURE BY AGE OF HOUSEHOLDER "                                           189
 9 "FAMILY TYPE BY PRESENCE AND AGE OF OWN CHILDREN "                        180
10 "TENURE BY HOUSEHOLD SIZE "                                               153
# ℹ 66 more rows
dhc_20 %>% filter(str_detect(concept, "POP")) %>% 
  mutate(changed_label = str_trim(str_remove_all(label, "[!:]"))) %>%
  filter(changed_label == "Total")
# A tibble: 39 × 4
   name        label       concept                                 changed_label
   <chr>       <chr>       <chr>                                   <chr>        
 1 H8_001N     " !!Total"  TOTAL POPULATION IN OCCUPIED HOUSING U… Total        
 2 P10_001N    " !!Total:" RACE FOR THE POPULATION 18 YEARS AND O… Total        
 3 P11_001N    " !!Total:" HISPANIC OR LATINO, AND NOT HISPANIC O… Total        
 4 P14_001N    " !!Total:" SEX BY AGE FOR THE POPULATION UNDER 20… Total        
 5 P15_001N    " !!Total:" POPULATION IN HOUSEHOLDS BY AGE         Total        
 6 P18_001N    " !!Total:" GROUP QUARTERS POPULATION BY SEX BY AG… Total        
 7 P1_001N     " !!Total"  TOTAL POPULATION                        Total        
 8 PCO1_001N   " !!Total:" GROUP QUARTERS POPULATION BY SEX BY AGE Total        
 9 PCT13A_001N " !!Total:" SEX BY AGE FOR THE POPULATION IN HOUSE… Total        
10 PCT13B_001N " !!Total:" SEX BY AGE FOR THE POPULATION IN HOUSE… Total        
# ℹ 29 more rows
dhc_20 %>% filter(str_starts(name, "P1_")) %>% arrange(name)
# A tibble: 1 × 3
  name    label      concept         
  <chr>   <chr>      <chr>           
1 P1_001N " !!Total" TOTAL POPULATION
total_pop <- tidycensus::get_decennial(geography = "county", variables = "P1_001N")
Getting data from the 2020 decennial Census
Using the PL 94-171 Redistricting Data Summary File
Note: 2020 decennial Census data use differential privacy, a technique that
introduces errors into data to preserve respondent confidentiality.
ℹ Small counts should be interpreted with caution.
ℹ See https://www.census.gov/library/fact-sheets/2021/protecting-the-confidentiality-of-the-2020-census-redistricting-data.html for additional guidance.
This message is displayed once per session.
total_pop
# A tibble: 3,221 × 4
   GEOID NAME                     variable  value
   <chr> <chr>                    <chr>     <dbl>
 1 01001 Autauga County, Alabama  P1_001N   58805
 2 01003 Baldwin County, Alabama  P1_001N  231767
 3 01005 Barbour County, Alabama  P1_001N   25223
 4 01007 Bibb County, Alabama     P1_001N   22293
 5 01009 Blount County, Alabama   P1_001N   59134
 6 01011 Bullock County, Alabama  P1_001N   10357
 7 01013 Butler County, Alabama   P1_001N   19051
 8 01015 Calhoun County, Alabama  P1_001N  116441
 9 21135 Lewis County, Kentucky   P1_001N   13080
10 21137 Lincoln County, Kentucky P1_001N   24275
# ℹ 3,211 more rows
transport_vars <- tidycensus::load_variables(year=2020, dataset = "acs5", cache = TRUE) %>%
  filter(str_starts(name, "B08301")) %>% 
  filter(str_count(label, "\\!\\!") == 2) %>% select(name, label)
transport_pop <- tidycensus::get_acs(geography = "county", variables = transport_vars$name, year=2020) %>%
  left_join(transport_vars, join_by(variable == name))
Getting data from the 2016-2020 5-year ACS
transport_pop
# A tibble: 25,768 × 6
   GEOID NAME                    variable   estimate   moe label                
   <chr> <chr>                   <chr>         <dbl> <dbl> <chr>                
 1 01001 Autauga County, Alabama B08301_002    23401   847 Estimate!!Total:!!Ca…
 2 01001 Autauga County, Alabama B08301_010      109   106 Estimate!!Total:!!Pu…
 3 01001 Autauga County, Alabama B08301_016        0    29 Estimate!!Total:!!Ta…
 4 01001 Autauga County, Alabama B08301_017       43    48 Estimate!!Total:!!Mo…
 5 01001 Autauga County, Alabama B08301_018       68   113 Estimate!!Total:!!Bi…
 6 01001 Autauga County, Alabama B08301_019      183   132 Estimate!!Total:!!Wa…
 7 01001 Autauga County, Alabama B08301_020      191   150 Estimate!!Total:!!Ot…
 8 01001 Autauga County, Alabama B08301_021      954   250 Estimate!!Total:!!Wo…
 9 01003 Baldwin County, Alabama B08301_002    88321  2037 Estimate!!Total:!!Ca…
10 01003 Baldwin County, Alabama B08301_010       30    35 Estimate!!Total:!!Pu…
# ℹ 25,758 more rows
transport_pop %>% group_by(variable) %>% slice_max(order_by = estimate, n = 1) 
# A tibble: 8 × 6
# Groups:   variable [8]
  GEOID NAME                           variable   estimate   moe label          
  <chr> <chr>                          <chr>         <dbl> <dbl> <chr>          
1 06037 Los Angeles County, California B08301_002  3902247  9941 Estimate!!Tota…
2 36047 Kings County, New York         B08301_010   682380  6289 Estimate!!Tota…
3 36061 New York County, New York      B08301_016    20968  1547 Estimate!!Tota…
4 06037 Los Angeles County, California B08301_017    10861   778 Estimate!!Tota…
5 06037 Los Angeles County, California B08301_018    32169  1479 Estimate!!Tota…
6 36061 New York County, New York      B08301_019   172404  4397 Estimate!!Tota…
7 06037 Los Angeles County, California B08301_020    54593  2292 Estimate!!Tota…
8 06037 Los Angeles County, California B08301_021   384448  5604 Estimate!!Tota…
transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  # filter(value > 1e4) %>%
  mutate(pop_perc = estimate/value) %>%
  group_by(variable_transport) %>% 
  slice_max(order_by = pop_perc, n = 1) 
# A tibble: 8 × 10
# Groups:   variable_transport [8]
  GEOID NAME_transport          variable_transport estimate   moe label NAME_pop
  <chr> <chr>                   <chr>                 <dbl> <dbl> <chr> <chr>   
1 15005 Kalawao County, Hawaii  B08301_002              399   182 Esti… Kalawao…
2 36061 New York County, New Y… B08301_010           480245  5719 Esti… New Yor…
3 02150 Kodiak Island Borough,… B08301_016              459   229 Esti… Kodiak …
4 13101 Echols County, Georgia  B08301_017               60    63 Esti… Echols …
5 15005 Kalawao County, Hawaii  B08301_018                6     4 Esti… Kalawao…
6 02068 Denali Borough, Alaska  B08301_019              918  1072 Esti… Denali …
7 02158 Kusilvak Census Area, … B08301_020             1004    94 Esti… Kusilva…
8 46063 Harding County, South … B08301_021              252    90 Esti… Harding…
# ℹ 3 more variables: variable_pop <chr>, value <dbl>, pop_perc <dbl>
counties_tigris <- tigris::counties(year = 2020, cb=TRUE, resolution = "20m", progress_bar=FALSE)
counties_tigris
Simple feature collection with 3221 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.1743 ymin: 17.91377 xmax: 179.7739 ymax: 71.35256
Geodetic CRS:  NAD83
First 10 features:
   STATEFP COUNTYFP COUNTYNS       AFFGEOID GEOID       NAME          NAMELSAD
1       01      061 00161556 0500000US01061 01061     Geneva     Geneva County
2       08      125 00198178 0500000US08125 08125       Yuma       Yuma County
3       17      177 01785076 0500000US17177 17177 Stephenson Stephenson County
4       28      153 00695797 0500000US28153 28153      Wayne      Wayne County
5       34      041 00882237 0500000US34041 34041     Warren     Warren County
6       46      051 01265782 0500000US46051 46051      Grant      Grant County
7       51      013 01480097 0500000US51013 51013  Arlington  Arlington County
8       21      007 00516850 0500000US21007 21007    Ballard    Ballard County
9       37      091 01026127 0500000US37091 37091   Hertford   Hertford County
10      13      057 01685718 0500000US13057 13057   Cherokee   Cherokee County
   STUSPS     STATE_NAME LSAD      ALAND   AWATER
1      AL        Alabama   06 1487908432 11567409
2      CO       Colorado   06 6123763559 11134665
3      IL       Illinois   06 1461392061  1350223
4      MS    Mississippi   06 2099745602  7255476
5      NJ     New Jersey   06  923435921 15822933
6      SD   South Dakota   06 1764937243 15765681
7      VA       Virginia   06   67332990   273562
8      KY       Kentucky   06  639538865 70070117
9      NC North Carolina   06  914689325 18737418
10     GA        Georgia   06 1090597331 34427734
                         geometry
1  MULTIPOLYGON (((-86.19348 3...
2  MULTIPOLYGON (((-102.8038 4...
3  MULTIPOLYGON (((-89.92647 4...
4  MULTIPOLYGON (((-88.94335 3...
5  MULTIPOLYGON (((-75.19261 4...
6  MULTIPOLYGON (((-97.22624 4...
7  MULTIPOLYGON (((-77.17228 3...
8  MULTIPOLYGON (((-89.16809 3...
9  MULTIPOLYGON (((-77.20847 3...
10 MULTIPOLYGON (((-84.64429 3...
ggplot(data = counties_tigris) + geom_sf()

counties_tigris <- tigris::shift_geometry(counties_tigris)
counties_tigris
Simple feature collection with 3221 features and 12 fields
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -3112200 ymin: -1697728 xmax: 2258154 ymax: 1558935
Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic
First 10 features:
   STATEFP COUNTYFP COUNTYNS       AFFGEOID GEOID       NAME          NAMELSAD
1       01      061 00161556 0500000US01061 01061     Geneva     Geneva County
2       08      125 00198178 0500000US08125 08125       Yuma       Yuma County
3       17      177 01785076 0500000US17177 17177 Stephenson Stephenson County
4       28      153 00695797 0500000US28153 28153      Wayne      Wayne County
5       34      041 00882237 0500000US34041 34041     Warren     Warren County
6       46      051 01265782 0500000US46051 46051      Grant      Grant County
7       51      013 01480097 0500000US51013 51013  Arlington  Arlington County
8       21      007 00516850 0500000US21007 21007    Ballard    Ballard County
9       37      091 01026127 0500000US37091 37091   Hertford   Hertford County
10      13      057 01685718 0500000US13057 13057   Cherokee   Cherokee County
   STUSPS     STATE_NAME LSAD      ALAND   AWATER
1      AL        Alabama   06 1487908432 11567409
2      CO       Colorado   06 6123763559 11134665
3      IL       Illinois   06 1461392061  1350223
4      MS    Mississippi   06 2099745602  7255476
5      NJ     New Jersey   06  923435921 15822933
6      SD   South Dakota   06 1764937243 15765681
7      VA       Virginia   06   67332990   273562
8      KY       Kentucky   06  639538865 70070117
9      NC North Carolina   06  914689325 18737418
10     GA        Georgia   06 1090597331 34427734
                         geometry
1  MULTIPOLYGON (((929857.8 -6...
2  MULTIPOLYGON (((-575240.5 3...
3  MULTIPOLYGON (((495694.5 57...
4  MULTIPOLYGON (((664464.4 -6...
5  MULTIPOLYGON (((1729323 550...
6  MULTIPOLYGON (((-96128.46 8...
7  MULTIPOLYGON (((1607262 315...
8  MULTIPOLYGON (((601189.3 -2...
9  MULTIPOLYGON (((1662478 244...
10 MULTIPOLYGON (((1034040 -29...
ggplot(data = counties_tigris) + geom_sf()

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  filter(variable_transport == "B08301_016") %>%
  mutate(pop_perc = estimate/value) %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  ggplot(data = .) +
  geom_sf(aes(fill = pop_perc)) +
  scale_fill_gradient(low = "white", high = "darkgreen", na.value = "white", name = "Percentage of population") +
  ggthemes::theme_tufte() 

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  filter(variable_transport == "B08301_016") %>%
  mutate(pop_perc = estimate/value) %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  ggplot(data = .) +
  geom_sf(aes(fill = pop_perc), lwd=0.1) +
  scale_fill_gradient(low = "white", high = "darkgreen",trans = "log", na.value = "white", name = "Percentage of population", labels = scales::label_percent()) +
  ggthemes::theme_tufte() 

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  # filter(variable_transport != "B08301_002") %>%
  mutate(pop_perc = estimate/value) %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  ggplot(data = .) +
  geom_sf(aes(fill = pop_perc), color=NA) +
  scale_fill_gradient(low = "white", high = "darkgreen",trans = "log", na.value = "white", name = "Percentage of population", labels = scales::label_percent()) +
  facet_wrap(vars(label)) +
  ggthemes::theme_tufte() 

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  group_by(GEOID, NAME_transport) %>%
  summarise(
perc_non_car = (sum(estimate) - 
                sum(estimate[variable_transport != "B08301_002"])) / mean(value)
  ) %>%
  ungroup() %>% 
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  ggplot(data = .) +
  geom_sf(aes(fill = perc_non_car), color=NA) +
  scale_fill_gradient(low = "white", high = "darkgreen",trans = "log", na.value = "white", name = "Percentage of population not using cars", labels = scales::label_percent())
`summarise()` has grouped output by 'GEOID'. You can override using the
`.groups` argument.

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  group_by(GEOID, NAME_transport) %>%
  summarise(
perc_non_car = (sum(estimate) - 
                sum(estimate[variable_transport != "B08301_002"])) / mean(value)
  ) %>%
  ungroup() %>% arrange(desc(perc_non_car))
`summarise()` has grouped output by 'GEOID'. You can override using the
`.groups` argument.
# A tibble: 3,221 × 3
   GEOID NAME_transport               perc_non_car
   <chr> <chr>                               <dbl>
 1 15005 Kalawao County, Hawaii              4.87 
 2 48301 Loving County, Texas                0.938
 3 48137 Edwards County, Texas               0.579
 4 46117 Stanley County, South Dakota        0.549
 5 08047 Gilpin County, Colorado             0.539
 6 46079 Lake County, South Dakota           0.538
 7 31057 Dundy County, Nebraska              0.534
 8 48105 Crockett County, Texas              0.532
 9 48197 Hardeman County, Texas              0.527
10 20069 Gray County, Kansas                 0.526
# ℹ 3,211 more rows
total_pop  %>% filter(GEOID == "15005")
# A tibble: 1 × 4
  GEOID NAME                   variable value
  <chr> <chr>                  <chr>    <dbl>
1 15005 Kalawao County, Hawaii P1_001N     82
transport_pop %>% filter(GEOID == "15005")
# A tibble: 8 × 6
  GEOID NAME                   variable   estimate   moe label                  
  <chr> <chr>                  <chr>         <dbl> <dbl> <chr>                  
1 15005 Kalawao County, Hawaii B08301_002      399   182 Estimate!!Total:!!Car,…
2 15005 Kalawao County, Hawaii B08301_010        1     2 Estimate!!Total:!!Publ…
3 15005 Kalawao County, Hawaii B08301_016        0    12 Estimate!!Total:!!Taxi…
4 15005 Kalawao County, Hawaii B08301_017        0    12 Estimate!!Total:!!Moto…
5 15005 Kalawao County, Hawaii B08301_018        6     4 Estimate!!Total:!!Bicy…
6 15005 Kalawao County, Hawaii B08301_019       16     8 Estimate!!Total:!!Walk…
7 15005 Kalawao County, Hawaii B08301_020        0    12 Estimate!!Total:!!Othe…
8 15005 Kalawao County, Hawaii B08301_021        2     3 Estimate!!Total:!!Work…
total_pop_acs <- get_acs(geography = "county", variables = "B01001_001E", year = 2023)
Getting data from the 2019-2023 5-year ACS
total_pop_acs %>% filter(GEOID == "15005")
# A tibble: 1 × 5
  GEOID NAME                   variable   estimate   moe
  <chr> <chr>                  <chr>         <dbl> <dbl>
1 15005 Kalawao County, Hawaii B01001_001       43    17
transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  filter(value > 10000) %>%
  group_by(GEOID, NAME_transport) %>%
  summarise(
perc_non_car = (sum(estimate) - 
                sum(estimate[variable_transport != "B08301_002"])) / mean(value)
  ) %>%
  ungroup() %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  ggplot(data = .) +
  geom_sf(aes(fill = perc_non_car), color=NA) +
  scale_fill_gradient(low = "white", high = "darkgreen", na.value = "lightgray", name = "Percentage of population not using cars", labels = scales::label_percent())
`summarise()` has grouped output by 'GEOID'. You can override using the
`.groups` argument.

Other mapping stuff!

https://r-spatial.github.io/mapview/index.html

# install.packages("mapview")
library(mapview)

transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  filter(value > 10000) %>%
  group_by(GEOID, NAME_transport) %>%
  summarise(
perc_non_car = (sum(estimate) - 
                sum(estimate[variable_transport != "B08301_002"])) / mean(value)
  ) %>%
  ungroup() %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>%
  mapview::mapview(zcol = "perc_non_car", label = "NAME_transport")
`summarise()` has grouped output by 'GEOID'. You can override using the
`.groups` argument.

https://rstudio.github.io/leaflet/

# install.packages("leaflet")
library(leaflet)

prep_data <- transport_pop %>% 
  left_join(total_pop, join_by(GEOID), suffix = c("_transport", "_pop")) %>%
  filter(value > 10000) %>%
  group_by(GEOID, NAME_transport) %>%
  summarise(
perc_non_car = (sum(estimate) - 
                sum(estimate[variable_transport != "B08301_002"])) / mean(value)
  ) %>%
  ungroup() %>%
  left_join(x = counties_tigris, y = ., join_by(GEOID)) %>% 
  st_transform(crs = 4326)
`summarise()` has grouped output by 'GEOID'. You can override using the
`.groups` argument.
mypalette <- colorNumeric(
  palette = "viridis", domain = prep_data$perc_non_car,
  na.color = "transparent"
)

leaflet(prep_data) %>%
  addPolygons(
    fillColor = ~ mypalette(perc_non_car),
    stroke = FALSE,
    fillOpacity = 0.75,
  ) %>% 
  addTiles()